Load packages
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(skimr)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(recipes)
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
##
## fixed
## The following object is masked from 'package:stats':
##
## step
Read in data from CSV file (download to project folder)
# Import the airline satisfaction survey from the project folder
airsat <- read_csv("airsatisfaction.csv")

# Show the imported tibble
print(airsat)
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## satisfaction = col_character(),
## sex = col_character(),
## customer_type = col_character(),
## travel_type = col_character(),
## class = col_character()
## )
## i Use `spec()` for the full column specifications.
## # A tibble: 10,000 x 23
## satisfaction sex customer_type age travel_type class flight_distance
## <chr> <chr> <chr> <dbl> <chr> <chr> <dbl>
## 1 dissatisfied Female loyal 39 Business Business 2620
## 2 dissatisfied Female loyal 66 Business Business 2364
## 3 satisfied Female loyal 8 Personal Eco 1828
## 4 dissatisfied Male loyal 43 Business Eco 3564
## 5 dissatisfied Female disloyal 26 Business Eco 2040
## 6 dissatisfied Male loyal 29 Personal Eco Plus 2439
## 7 satisfied Female loyal 44 Business Business 858
## 8 dissatisfied Male disloyal 39 Business Business 1610
## 9 dissatisfied Male loyal 65 Personal Business 691
## 10 satisfied Female loyal 40 Business Business 2889
## # ... with 9,990 more rows, and 16 more variables: seat_comfort <dbl>,
## # time_convenience <dbl>, food_drink <dbl>, gate_location <dbl>,
## # inflight_wifi <dbl>, inflight_entertainment <dbl>, online_support <dbl>,
## # ease_booking <dbl>, onboard_service <dbl>, leg_room <dbl>,
## # baggage_handling <dbl>, checkin_service <dbl>, cleanliness <dbl>,
## # online_boarding <dbl>, departure_delay <dbl>, arrival_delay <dbl>
Create training and testing sets
# Fix the RNG seed so the same rows are drawn on every run
set.seed(2021)

# Draw 80% of the row positions, partitioning on the satisfaction outcome
index <- createDataPartition(
  y = airsat[["satisfaction"]],
  p = 0.8,
  list = FALSE
)

# Selected rows form the training set; the remaining rows form the testing set
airsat_train <- airsat[index, ]
airsat_test <- airsat[-index, ]

# Display both partitions
print(airsat_train)
print(airsat_test)
Explore the training set
# Summarise every column of the training set by type (character vs numeric),
# including missing-value counts and, for numeric columns, quantiles.
# Called directly (not piped) so the summary is labelled with the data name.
skim(airsat_train)
| Name | airsat_train |
| Number of rows | 8001 |
| Number of columns | 23 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| numeric | 18 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| satisfaction | 0 | 1 | 9 | 12 | 0 | 2 | 0 |
| sex | 0 | 1 | 4 | 6 | 0 | 2 | 0 |
| customer_type | 0 | 1 | 5 | 8 | 0 | 2 | 0 |
| travel_type | 0 | 1 | 8 | 8 | 0 | 2 | 0 |
| class | 0 | 1 | 3 | 8 | 0 | 3 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| age | 0 | 1 | 39.52 | 14.95 | 7 | 27 | 40 | 51 | 85 | ▃▇▇▅▁ |
| flight_distance | 0 | 1 | 1987.42 | 1037.12 | 50 | 1364 | 1918 | 2559 | 6595 | ▃▇▃▁▁ |
| seat_comfort | 0 | 1 | 2.88 | 1.39 | 0 | 2 | 3 | 4 | 5 | ▇▇▇▇▅ |
| time_convenience | 0 | 1 | 2.99 | 1.52 | 0 | 2 | 3 | 4 | 5 | ▇▆▆▇▇ |
| food_drink | 0 | 1 | 2.90 | 1.43 | 0 | 2 | 3 | 4 | 5 | ▇▇▇▇▆ |
| gate_location | 0 | 1 | 3.00 | 1.31 | 1 | 2 | 3 | 4 | 5 | ▅▆▇▇▅ |
| inflight_wifi | 0 | 1 | 3.25 | 1.32 | 0 | 2 | 3 | 4 | 5 | ▃▇▇▇▇ |
| inflight_entertainment | 0 | 1 | 3.42 | 1.33 | 0 | 3 | 4 | 4 | 5 | ▂▃▅▇▆ |
| online_support | 0 | 1 | 3.53 | 1.31 | 1 | 3 | 4 | 5 | 5 | ▃▃▅▇▇ |
| ease_booking | 0 | 1 | 3.48 | 1.31 | 1 | 2 | 4 | 5 | 5 | ▃▅▅▇▇ |
| onboard_service | 0 | 1 | 3.48 | 1.27 | 1 | 3 | 4 | 4 | 5 | ▂▃▅▇▆ |
| leg_room | 0 | 1 | 3.48 | 1.30 | 0 | 2 | 4 | 5 | 5 | ▂▅▅▇▇ |
| baggage_handling | 0 | 1 | 3.70 | 1.16 | 1 | 3 | 4 | 5 | 5 | ▂▂▃▇▆ |
| checkin_service | 0 | 1 | 3.34 | 1.27 | 1 | 3 | 3 | 4 | 5 | ▃▃▇▇▆ |
| cleanliness | 0 | 1 | 3.70 | 1.15 | 1 | 3 | 4 | 5 | 5 | ▁▂▃▇▆ |
| online_boarding | 0 | 1 | 3.35 | 1.31 | 1 | 2 | 4 | 4 | 5 | ▃▅▇▇▇ |
| departure_delay | 0 | 1 | 14.32 | 36.88 | 0 | 0 | 0 | 12 | 569 | ▇▁▁▁▁ |
| arrival_delay | 34 | 1 | 14.56 | 37.15 | 0 | 0 | 0 | 13 | 543 | ▇▁▁▁▁ |
Create and prep recipe
# Preprocessing recipe: predict satisfaction from all other columns.
# `airsat` only supplies the column template; every step is estimated on
# `airsat_train` by the final prep() call, which also logs column changes.
airsat_recipe <-
  recipe(satisfaction ~ ., data = airsat) %>%
  # Remove predictors with near-zero variance
  step_nzv(all_predictors()) %>%
  # Remove numeric predictors that are linear combinations of others
  step_lincomb(all_numeric_predictors()) %>%
  # Normalize the numeric predictors before extracting components
  step_normalize(all_numeric_predictors()) %>%
  # Replace the numeric predictors with principal components (threshold 0.9)
  step_pca(all_numeric_predictors(), threshold = 0.9) %>%
  # Convert the nominal predictors to dummy-coded indicator columns
  step_dummy(all_nominal_predictors()) %>%
  prep(training = airsat_train, log_changes = TRUE)
## step_nzv (nzv_AY9ua):
## removed (2): departure_delay, arrival_delay
##
## step_lincomb (lincomb_AySWE): same number of columns
##
## step_normalize (normalize_V6vGz): same number of columns
##
## step_pca (pca_wPZrK):
## new (11): PC01, PC02, PC03, PC04, PC05, PC06, PC07, PC08, PC09, PC10, ...
## removed (16): age, flight_distance, seat_comfort, time_convenience, ...
##
## step_dummy (dummy_5NO9d):
## new (5): sex_Male, customer_type_loyal, travel_type_Personal, class_Eco, ...
## removed (4): sex, customer_type, travel_type, class
Bake new training set
# Apply the prepped recipe to the training rows
airsat_baked_train <- airsat_recipe %>%
  bake(new_data = airsat_train)

# Show the processed training set
print(airsat_baked_train)
## # A tibble: 8,001 x 17
## satisfaction PC01 PC02 PC03 PC04 PC05 PC06 PC07 PC08
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 dissatisfied -1.15 -0.759 0.759 0.808 1.46 0.537 0.645 -0.195
## 2 satisfied 2.61 -0.466 -1.72 1.11 -1.17 0.169 -1.55 0.645
## 3 dissatisfied -4.02 -1.50 -0.578 0.225 -0.182 1.48 -0.260 0.269
## 4 dissatisfied -2.49 1.17 -0.922 0.771 -0.372 -2.31 -0.188 -0.537
## 5 satisfied 1.30 0.121 -0.0532 -1.16 -0.598 0.0442 -0.499 0.119
## 6 satisfied 1.95 1.68 -0.982 0.529 0.247 0.596 0.930 -0.0723
## 7 satisfied 1.03 -1.64 3.46 0.144 0.496 1.96 -0.750 -0.561
## 8 satisfied 4.07 -1.69 -0.354 -1.20 1.16 -0.463 -0.101 0.214
## 9 dissatisfied -1.87 -0.569 0.125 0.707 1.03 1.01 1.00 -0.220
## 10 dissatisfied 0.830 2.56 1.18 -0.811 1.21 -2.19 0.729 0.635
## # ... with 7,991 more rows, and 8 more variables: PC09 <dbl>, PC10 <dbl>,
## # PC11 <dbl>, sex_Male <dbl>, customer_type_loyal <dbl>,
## # travel_type_Personal <dbl>, class_Eco <dbl>, class_Eco.Plus <dbl>
Bake new testing set
# Apply the same prepped recipe (estimated on the training set) to the test rows
airsat_baked_test <- airsat_recipe %>%
  bake(new_data = airsat_test)

# Show the processed testing set
print(airsat_baked_test)
## # A tibble: 1,999 x 17
## satisfaction PC01 PC02 PC03 PC04 PC05 PC06 PC07 PC08 PC09
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 dissatisfied -0.0375 -1.71 1.42 -0.734 1.23 0.477 1.80 -0.529 -0.289
## 2 dissatisfied -1.34 -1.98 1.02 1.09 -0.159 -0.547 1.00 -1.16 0.424
## 3 dissatisfied -0.611 1.80 -0.977 -0.406 -0.620 -0.563 -0.337 1.14 1.01
## 4 dissatisfied 3.14 -0.204 -0.823 -2.00 -0.308 0.439 -0.207 0.263 1.22
## 5 satisfied -3.41 1.42 0.348 -0.648 -1.16 0.235 0.494 2.17 0.462
## 6 satisfied 2.03 -2.01 -1.96 1.39 1.85 0.614 -0.593 0.450 -0.445
## 7 dissatisfied -1.36 1.71 0.360 -0.676 -0.705 0.0296 -1.06 -0.817 0.580
## 8 satisfied 0.107 1.29 1.89 0.600 -1.01 -0.422 1.75 -0.162 0.283
## 9 dissatisfied 2.34 -0.535 0.321 2.65 1.40 -0.711 -0.271 0.391 -0.561
## 10 satisfied -0.368 -2.88 -2.23 -1.59 0.741 -1.10 -0.538 -0.646 1.41
## # ... with 1,989 more rows, and 7 more variables: PC10 <dbl>, PC11 <dbl>,
## # sex_Male <dbl>, customer_type_loyal <dbl>, travel_type_Personal <dbl>,
## # class_Eco <dbl>, class_Eco.Plus <dbl>
Modify the code above to accomplish the following goals:
1. Use 75% of the data for training and 25% of the data for testing.
2. Apply the Yeo-Johnson transformation to the flight_distance variable (before normalizing it).
3. Instead of using PCA to address multicollinearity, drop highly correlated predictors.
4. Use one-hot encoding for the nominal predictors instead of dummy codes.
# Fix the RNG seed so the same rows are drawn on every run
set.seed(2021)

# Goal 1: draw 75% of the row positions for training (25% left for testing)
index <- createDataPartition(
  y = airsat[["satisfaction"]],
  p = 0.75,
  list = FALSE
)

# Selected rows form the training set; the remaining rows form the testing set
airsat_train <- airsat[index, ]
airsat_test <- airsat[-index, ]

# Display both partitions
print(airsat_train)
print(airsat_test)
# Revised preprocessing recipe: predict satisfaction from all other columns.
# `airsat` only supplies the column template; every step is estimated on
# `airsat_train` by the final prep() call, which also logs column changes.
airsat_recipe <-
  recipe(satisfaction ~ ., data = airsat) %>%
  # Remove predictors with near-zero variance
  step_nzv(all_predictors()) %>%
  # Remove numeric predictors that are linear combinations of others
  step_lincomb(all_numeric_predictors()) %>%
  # Goal 2: Yeo-Johnson transform flight_distance, placed before normalizing
  step_YeoJohnson(flight_distance) %>%
  step_normalize(all_numeric_predictors()) %>%
  # Goal 3: filter highly correlated numeric predictors instead of using PCA
  step_corr(all_numeric_predictors()) %>%
  # Goal 4: one-hot encode the nominal predictors (one column per level)
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  prep(training = airsat_train, log_changes = TRUE)
## step_nzv (nzv_fnnfs):
## removed (2): departure_delay, arrival_delay
##
## step_lincomb (lincomb_s6zuF): same number of columns
##
## step_YeoJohnson (YeoJohnson_gMFXy): same number of columns
##
## step_normalize (normalize_7QqN7): same number of columns
##
## step_corr (corr_NMHen): same number of columns
##
## step_dummy (dummy_b1gwO):
## new (9): sex_Female, sex_Male, customer_type_disloyal, ...
## removed (4): sex, customer_type, travel_type, class